#remove scientific notation
options(scipen=999)
library(stringr)
library(corrplot)
## corrplot 0.84 loaded
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")
data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"
# Read in the data
US.deaths <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = T, stringsAsFactors = F)
US.cases <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = T, stringsAsFactors = F)
# Read in the header seprately.
US.cases.head <- read.csv(
paste0(data.path, "time_series_covid19_confirmed_US.csv"),
header = F, stringsAsFactors = F)[1,]
US.deaths.head <- read.csv(
paste0(data.path, "time_series_covid19_deaths_US.csv"),
header = F, stringsAsFactors = F)[1,]
# Correct the dates in the header to be more useable as
# column names.
proper_date <- function(dates){
dates <- sapply(dates, strsplit, split = "/")
dates <- lapply(dates, str_pad, width = 2, side = "left", pad = "0")
dates <- lapply(dates, paste, collapse = "_")
dates <- unlist(dates)
return(dates)
}
dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])
names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)
if(sum(US.cases$UID != US.deaths$UID) > 0){warning("COVID data rows do not match!")}
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]
US.cases.info <- as.matrix(US.cases[,1:12])
US.cases.data <- as.matrix(US.cases[,-c(2:12)])
US.deaths.info <- as.matrix(US.deaths[,1:12])
US.deaths.data <- as.matrix(US.deaths[,-c(2:12)])
rownames(US.cases.info) <- US.cases.info[,1]
US.cases.info <- US.cases.info[,-1]
rownames(US.cases.data) <- US.cases.data[,1]
US.cases.data <- US.cases.data[,-1]
rownames(US.deaths.info) <- US.deaths.info[,1]
US.deaths.info <- US.deaths.info[,-1]
rownames(US.deaths.data) <- US.deaths.data[,1]
US.deaths.data <- US.deaths.data[,-1]
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)
nobs <- nrow(US.cases.data)
state.curve <- function(state, stat = c("cases", "deaths")){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Province_State == state),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Province_State == state),]
}
data.sum <- colSums(data)
day.first.case <- min(which(data.sum > 0))
n.days <- length(data.sum)
barplot(data.sum[day.first.case:n.days],
main = paste0("Total COVID-19 ", stat," by date in ", state),
log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
state.curve("New York", "cases")
state.curve("New York", "deaths")
state.curve("California", "cases")
state.curve("California", "deaths")
state.curve("Oklahoma", "cases")
state.curve("Oklahoma", "deaths")
county.curve <- function(county, stat = c("cases", "deaths")){
if(stat == "cases"){
data <- US.cases.data[which(US.cases$Admin2 == county),]
}else if(stat == "deaths"){
data <- US.deaths.data[which(US.deaths$Admin2 == county),]
}
day.first.case <- min(which(data > 0))
n.days <- length(data)
barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
}
county.curve("Tulsa", "cases")
county.curve("Tulsa", "deaths")
US.stats <- data.frame(UID = US.cases$UID)
cases.total <- colSums(US.cases.data)
day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)
par(mar = c(5,5,4,2))
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(cases.total[day.first.case:n.days], main = "Total COVID-19 cases by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
deaths.total <- colSums(US.deaths.data)
day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
barplot(deaths.total[day.first.case:n.days], main = "Total COVID-19 deaths by Date in US, log scale", las = 2, cex.axis = 1, cex.names = 0.5, log = "y")
avg.rise.cases
rise.cases <- matrix(ncol = ndays.cases - 1, nrow = nobs)
colnames(rise.cases) <- colnames(US.cases.data)[-1]
for(i in 1:ncol(rise.cases) + 1){
rise <- US.cases.data[,i] - US.cases.data[,i-1]
rise.cases[,i-1] <- rise
}
US.stats$avg.rise.cases <- apply(rise.cases, 1, mean)
rise.cases.total <- colSums(rise.cases)
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)
barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
avg.rise.deaths
rise.deaths <- matrix(ncol = ndays.deaths - 1, nrow = nobs)
colnames(rise.deaths) <- colnames(US.deaths.data)[-1]
for(i in 1:ncol(rise.deaths) + 1){
rise <- US.deaths.data[,i] - US.deaths.data[,i-1]
rise.deaths[,i-1] <- rise
}
US.stats$avg.rise.deaths <- apply(rise.deaths, 1, mean)
rise.deaths.total <- colSums(rise.deaths)
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)
barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)
total.cases
US.stats$total.cases <- US.cases.data[,ndays.cases]
US.stats$total.cases.percap <- US.stats$total.cases / US.cases$Population
US.stats$total.cases.percap[US.cases$Population == 0] <- NA
hist(US.stats$total.cases.percap)
total.deaths
US.stats$total.deaths <- US.deaths.data[,ndays.deaths]
total.deaths.percap
US.stats$total.deaths.percap <- US.stats$total.deaths / US.deaths$Population
US.stats$total.deaths.percap[US.deaths$Population == 0] <- NA
max(US.stats$total.deaths.percap,na.rm = T)
## [1] 0.002565304
total.deaths.percase Error in Johns Hopkins data has rows with total.deaths > total.cases.
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0
US.stats$total.deaths.percase <- US.stats$total.deaths / US.stats$total.cases
US.stats$total.deaths.percase[US.stats$total.cases == 0] <- NA
US.stats[which(US.stats$total.deaths > US.stats$total.cases),]
## UID avg.rise.cases avg.rise.deaths total.cases
## 3203 84090002 0.0000000 0.04444444 0
## 3206 84090006 0.0000000 0.02222222 0
## 3222 84090024 0.0000000 1.33333333 0
## 3229 84090031 0.0000000 0.16666667 0
## 3231 84090033 0.1555556 0.43333333 14
## 3234 84090036 0.0000000 0.03333333 0
## 3248 84090051 0.0000000 1.55555556 0
## 3250 84090054 0.0000000 0.21111111 0
## 3252 84090056 0.0000000 0.01111111 0
## total.cases.percap total.deaths total.deaths.percap
## 3203 NA 4 NA
## 3206 NA 2 NA
## 3222 NA 120 NA
## 3229 NA 15 NA
## 3231 NA 39 NA
## 3234 NA 3 NA
## 3248 NA 140 NA
## 3250 NA 19 NA
## 3252 NA 1 NA
## total.deaths.percase
## 3203 NA
## 3206 NA
## 3222 NA
## 3229 NA
## 3231 2.785714
## 3234 NA
## 3248 NA
## 3250 NA
## 3252 NA
US.stats$ID <- str_pad(US.stats$UID, 8, "left", pad = "0")
US.stats$ID <- substr(US.stats$ID, 4, 8)
data.merge <- merge(US.stats, county_factors, by = "ID")
data.cor <- cor(data.merge[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
data.merge2 <- merge(data.merge, county_500CitiesData, by = "ID", all.x = F)
data.cor2 <- cor(data.merge2[,-c(1:2)], use = "complete.obs", method = "spearman")
corrplot.mixed(data.cor2, upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)
corrplot.mixed(data.cor2[1:7,8:42], upper = 'ellipse', lower = 'number', tl.pos = 'lt', tl.cex = 1, lower.col = "black", number.cex = 0.5)